1 module dataframe.hdf5;
2 import dataframe.common;
3 import dataframe.typed;
4 import hdf5.hdf5;
5 import dataframe.hdf5util;
6 import std.conv;
7 import std.csv;
8 import std.datetime;
9 import std.exception;
10 import std.range:array, stride,only;
11 import std.stdio;
12 import std.variant;
13 import std.string:isNumeric;
/// Tagged union over the scalar value kinds listed below (string, int, long,
/// DateTime, float, double). Not referenced elsewhere in this section of the file.
alias KalVariant=Algebraic!(string,int,long, DateTime, float,double);
15 import std.typecons:tuple,Tuple;
16 static import std.traits;
17 
/// Chunk length (in rows) used for the chunk dimension when toHDF5 creates a dataset.
enum CHUNKSIZE=260;
/// Description of a dataset's element type as returned by dataTypesForHDF5:
/// per-column titles, column types, byte offsets and byte sizes (parallel arrays),
/// plus the total size in bytes of one element.
alias DataTypes=Tuple!(string[],"columnTitles", ColumnType[],"columnTypes",int[],"offsets",int[],"sizes",int,"totalSize");
21 
22 
/**
 * Build an HDF5 compound datatype describing one row of `frame`.
 *
 * Members are inserted in `frame.columnTitles` order and packed back to back:
 * each member's offset is the running sum of the sizes of the columns before it.
 *
 * Params:
 *   frame = frame whose column titles and column types define the members
 *   name  = unused by this function; kept so existing callers keep compiling
 *
 * Returns: handle of the newly created compound type. The caller owns it and
 *          must release it with H5T.close.
 * Throws:  Exception when a column type has no size (columnSizeOf) or no
 *          HDF5 mapping (toH5Type).
 */
hid_t createDataType(DataFrameTyped frame, string name="")
{
	auto tid=H5T.create(H5TClass.Compound,frame.columnSizeOf);
	// toH5Type throws for column types it cannot map; do not leak the
	// half-built compound type in that case.
	scope(failure) H5T.close(tid);
	long offset=0L;
	foreach(colTitle;frame.columnTitles)
	{
		auto colType=frame.columnTypes[colTitle];
		H5T.insert(tid,colTitle,offset,colType.toH5Type);
		offset+=colType.columnSizeOf;
	}
	return tid;
}
35 
/// Sum of the per-column byte sizes for the given list of column types.
/// Throws: Exception if any entry has no known size.
size_t columnSizeOf(ColumnType[] types)
{
	size_t total=0;
	for(size_t i=0;i<types.length;++i)
		total+=columnSizeOf(types[i]);
	return total;
}
43 
/// Number of bytes one row of `frame` occupies when its columns are packed
/// back to back in `frame.columnTitles` order.
/// Throws: Exception if any column type has no known size.
size_t columnSizeOf(DataFrameTyped frame)
{
	size_t rowBytes=0;
	foreach(colTitle;frame.columnTitles)
	{
		auto colType=frame.columnTypes[colTitle];
		rowBytes+=columnSizeOf(colType);
	}
	return rowBytes;
}
51 
/// Size in bytes of a single cell of the given column type.
/// Throws: Exception for any column type not listed below.
size_t columnSizeOf(ColumnType type)
{
	size_t width;
	switch(type)
	{
		case ColumnType.Int:
			width=int.sizeof;
			break;
		case ColumnType.Long:
			width=long.sizeof;
			break;
		case ColumnType.Double:
			width=double.sizeof;
			break;
		case ColumnType.Date:
			width=std.datetime.Date.sizeof;
			break;
		case ColumnType.DateTime:
			width=std.datetime.DateTime.sizeof;
			break;
		default:
			throw new Exception("unknown type: "~type.to!string);
	}
	return width;
}
70 
71 
/**
 * Serialise `frame` row by row into one contiguous byte buffer.
 *
 * Each row takes `frame.columnSizeOf` bytes; within a row the cells follow
 * `frame.columnTitles` order with no padding. Int, Long and Double cells are
 * copied from `frame.values`. Date and DateTime cells only reserve their
 * space: nothing is written, so those bytes keep the buffer's initial zeros.
 *
 * Returns: the packed buffer, `frame.columnSizeOf * frame.numRows` bytes long.
 * Throws:  Exception (from columnSizeOf) if a column type has no known size.
 */
ubyte[] toBytes(DataFrameTyped frame)
{
	auto rowBytes=frame.columnSizeOf;
	ubyte[] packed = new ubyte[rowBytes*frame.numRows];
	foreach(row;0..frame.numRows)
	{
		// Write position of the next cell; starts at the beginning of this row.
		auto cursor=row*rowBytes;
		foreach(colTitle;frame.columnTitles)
		{
			auto kind=frame.columnTypes[colTitle];
			if (kind==ColumnType.Int)
			{
				*(cast(int*)&packed[cursor])=frame.values.ints[colTitle][row];
				cursor+=int.sizeof;
			}
			else if (kind==ColumnType.Long)
			{
				*(cast(long*)&packed[cursor])=frame.values.longs[colTitle][row];
				cursor+=long.sizeof;
			}
			else if (kind==ColumnType.Double)
			{
				*(cast(double*)&packed[cursor])=frame.values.doubles[colTitle][row];
				cursor+=double.sizeof;
			}
			else if (kind==ColumnType.Date)
			{
				// space reserved, value not serialised
				cursor+=std.datetime.Date.sizeof;
			}
			else if (kind==ColumnType.DateTime)
			{
				// space reserved, value not serialised
				cursor+=std.datetime.DateTime.sizeof;
			}
			// any other column type: nothing written, cursor unchanged
		}
	}
	return packed;
}
109 
/**
 * Build a frame of Double columns from a flat, row-major float array.
 *
 * `data` is read as consecutive rows of `columnTitles.length` values each;
 * element `row*numCols+col` becomes cell (row, columnTitles[col]). Trailing
 * values that do not fill a complete row are ignored.
 *
 * Params:
 *   data         = row-major cell values
 *   columnTitles = one title per column; must not be empty
 *
 * Returns: a DataFrameTyped with every column typed as ColumnType.Double.
 * Throws:  Exception if `columnTitles` is empty (the row count would otherwise
 *          be computed with a division by zero).
 */
auto dataFrameTypedFromFloats(float[] data,string[] columnTitles)
{
	enforce(columnTitles.length>0,
		new Exception("dataFrameTypedFromFloats: at least one column title is required"));
	auto numCols=columnTitles.length;
	auto numRows=data.length/numCols;

	ColumnType[] columnTypes;
	columnTypes.length=numCols;
	columnTypes[]=ColumnType.Double;

	DataFrameTyped ret;
	ret.setColumnTitles(columnTitles);
	ret.setColumnTypes(columnTypes);
	ret.setRows(numRows);
	foreach(row;0..numRows)
	{
		foreach(col;0..numCols)
		{
			ret[row,columnTitles[col]]=data[row*numCols+col].to!double;
		}
	}
	return ret;
}
131 
/**
 * Inspect the element type of a dataset and describe its columns.
 *
 * Params:
 *   filename    = HDF5 file, opened read-only
 *   datasetName = path of the dataset inside the file
 *
 * Returns:
 *   - Compound type: one entry per member (name, column type, byte offset,
 *     byte size) plus the size of the whole compound.
 *   - Integer type:  a single ColumnType.Int entry with the type's size.
 *   - Float type:    a single ColumnType.Double entry, no sizes.
 *   - anything else: an empty description.
 *
 * Throws: Exception if a compound member is itself a Compound or Array type,
 *         or if h5ClassToColumnType cannot map a member.
 *
 * Diagnostic information is printed to stdout as the type is inspected.
 */
DataTypes dataTypesForHDF5(string filename, string datasetName)
{
	string[] names;
	ColumnType[] types;
	int[] offsets,sizes;
	// Every handle opened here is released on all exit paths, including throws.
	auto file = H5F.open(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
	scope(exit) H5F.close(file);
	auto dataset = H5D.open2(file, datasetName, H5P_DEFAULT);
	scope(exit) H5D.close(dataset);
	auto s1_tid = H5D.get_type(dataset);
	scope(exit) H5T.close(s1_tid);
	switch(H5T.get_class(s1_tid)) with (H5TClass)
	{
		case Integer:
			// Query the dataset's datatype handle (not the ColumnType enum value).
			auto ord = H5Tget_order(s1_tid);
			auto sgn = H5Tget_sign(s1_tid);
			auto sz = H5Tget_size(s1_tid).to!int;
			writefln("Integer byte order = %s",ord); // H5TOrderLE or BE
			writefln("Integer sign = %s",sgn); // H5T SGN None or 2
			writefln("Integer size = %s",sz);
			return DataTypes([],[ColumnType.Int],[],[sz],sz);
		case Float:
			return DataTypes([],[ColumnType.Double],[],[],0);
		case Compound:
			auto sz = H5Tget_size(s1_tid).to!int;
			auto nmemb = H5Tget_nmembers(s1_tid);
			writefln("  %s bytes",sz);
			writefln("  %s members",nmemb);
			foreach(i;0..nmemb)
			{
				auto s2_tid = H5T.get_member_type(s1_tid, i);
				// The member type is a fresh handle that must be released per member.
				scope(exit) H5T.close(s2_tid);
				auto memberClass = H5T.get_class(s2_tid);
				// Only flat compounds of scalar members are supported.
				enforce(memberClass != H5TClass.Compound);
				enforce(memberClass != H5TClass.Array);
				auto memberName = H5T.get_member_name(s1_tid, i);
				auto memberOffset = H5T.get_member_offset(s1_tid, i);
				auto memberSize = H5T.get_size(s2_tid);
				writefln("    %s: type code %s offset %s size %s",
				              memberName,
				              memberClass,
				              memberOffset,
				              memberSize);
				names~=memberName;
				types~=memberClass.h5ClassToColumnType(memberSize.to!int);
				offsets~=memberOffset.to!int;
				sizes~=memberSize.to!int;
			}
			writefln("returning: %s,%s,%s,%s,%s",names,types,offsets,sizes,sz);
			stdout.flush;
			return DataTypes(names,types,offsets,sizes,sz);
		default:
			return DataTypes([],[],[],[],0);
	}
}
181 
/**
 * Map an HDF5 type class and element size (in bytes) to a frame column type.
 *
 * Float of any size maps to Double; Integer of 1, 2 or 4 bytes maps to Int and
 * of 8 bytes to Long.
 *
 * Throws: Exception for an Integer of any other size, or for any other class.
 */
ColumnType h5ClassToColumnType(H5TClass classType, int len)
{
	if (classType==H5TClass.Float)
		return ColumnType.Double;
	if (classType!=H5TClass.Integer)
		throw new Exception("unknown HDF5 class: "~classType.to!string);

	// Integer: the byte width decides between Int and Long.
	if (len==1 || len==2 || len==4)
		return ColumnType.Int;
	if (len==8)
		return ColumnType.Long;
	throw new Exception("weird length: "~len.to!string);
}
/// HDF5 native datatype used to store a column of the given type.
/// Throws: Exception for every column type other than Int, Long and Double.
hid_t toH5Type(ColumnType type)
{
	if (type==ColumnType.Int)
		return H5T_NATIVE_INT;
	if (type==ColumnType.Long)
		return H5T_NATIVE_LLONG;
	if (type==ColumnType.Double)
		return H5T_NATIVE_DOUBLE;
	throw new Exception("unknown type: "~ type.to!string);
}
217 
/**
 * Read a rank-1 dataset into a DataFrameTyped.
 *
 * The whole dataset is read into memory using its own on-disk datatype, then
 * decoded cell by cell using the member offsets and sizes reported by
 * dataTypesForHDF5.
 *
 * Integer members of 1, 2 or 4 bytes go to `values.ints`, 8-byte integers to
 * `values.longs`, and 4- or 8-byte floats to `values.doubles`. Members of any
 * other size or type are reported on stdout and skipped.
 *
 * Params:
 *   filename    = HDF5 file, opened read-only
 *   datasetName = path of the dataset inside the file
 *
 * Throws: Exception if the dataset's rank is not 1.
 */
DataFrameTyped  dataFrameTypedFromHDF5DataSet(string filename,string datasetName)
{
	// All handles are released on every exit path, including exceptions.
	auto file = H5F.open(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
	scope(exit) H5F.close(file);
	auto dataset = H5D.open2(file, datasetName, H5P_DEFAULT);
	scope(exit) H5D.close(dataset);

	auto dataType  = H5D.get_type(dataset);     /* datatype handle */
	scope(exit) H5T.close(dataType);
	auto size  = H5T.get_size(dataType);
	auto dataspace = H5D.get_space(dataset);    /* dataspace handle */
	scope(exit) H5S.close(dataspace);
	auto rank      = H5S.get_simple_extent_ndims(dataspace);
	// Check the rank BEFORE asking for the dimensions: the extents are written
	// into a fixed-size buffer, which a higher-rank dataset would overrun.
	enforce(rank==1,
		new Exception("only handle vector ie rank 1 tables currently and rank="~to!string(rank)));
	hsize_t[2]     dims_out;
	H5S.get_simple_extent_dims(dataspace, dims_out);
	writefln("dims=%s",dims_out);
	writefln("size=%s",size);
	writefln("total=%s",size*dims_out[0]);
	stdout.flush;
	auto data = new ubyte[dims_out[0]*size];
	H5D.read(dataset, dataType, H5S_ALL, H5S_ALL, H5P_DEFAULT, data.ptr);

	DataFrameTyped ret;
	auto meta=dataTypesForHDF5(filename,datasetName);
	ret.setColumnTitles(meta.columnTitles);
	ret.setColumnTypes(meta.columnTypes);
	foreach(row;0..dims_out[0])
	{
		auto rowOffset=meta.totalSize*row;
		int j=0;	// index into meta.offsets / meta.sizes for the current column
		foreach(colTitle;ret.columnTitles)
		{
			auto cellOffset=rowOffset+meta.offsets[j];
			switch(ret.columnTypes[colTitle])
			{
				case ColumnType.Int,ColumnType.Long:
					switch(meta.sizes[j])
					{
						case 1:
							ret.values.ints[colTitle]~=(*(cast(char*)(&data[cellOffset]))).to!int;
							break;
						case 2:
							ret.values.ints[colTitle]~=(*(cast(ushort*)(&data[cellOffset]))).to!int;
							break;
						case 4:
							ret.values.ints[colTitle]~=(*(cast(int*)(&data[cellOffset])));
							break;
						case 8:
							ret.values.longs[colTitle]~=*cast(long*)(&data[cellOffset]);
							break;
						default:
							writefln("skipping unknown field len: %s",colTitle);
							break;
					}
					break;
				case ColumnType.Double:
					switch(meta.sizes[j])
					{
						case 4:
							ret.values.doubles[colTitle]~=(*cast(float*)&data[cellOffset]).to!double;
							break;
						case 8:
							ret.values.doubles[colTitle]~=*cast(double*)&data[cellOffset];
							break;
						default:
							writefln("skipping unknown field len: %s",meta.sizes[j]);
							break;
					}
					break;
				default:
					writefln("skipping %s",ret.columnTypes[colTitle]);
					break;
			}
			++j;
		}
		++ret.numRows;
	}
	return ret;
}
303 
304 
305 
306 
/**
 * Write `frame` to a dataset of compound type in an HDF5 file.
 *
 * The file is opened read/write if it exists and created otherwise. When the
 * dataset already exists:
 *   - DumpMode.append   extends it by `frame.numRows` and writes the rows at the end;
 *   - DumpMode.truncate resizes it to `frame.numRows` and writes from row 0;
 *   - DumpMode.unlink   deletes the link and creates the dataset afresh.
 * When it does not exist, a chunked dataset is created and the rows written.
 *
 * Params:
 *   frame       = rows to write (serialised with toBytes / createDataType)
 *   filename    = target HDF5 file
 *   datasetName = path of the dataset inside the file
 *   mode        = what to do when the dataset already exists
 *   extensible  = when creating: unlimited maximum size if true, otherwise
 *                 fixed at `frame.numRows`
 *
 * Returns: `frame`, unchanged.
 * Throws:  Exception if the dataset exists and `mode` is none of the three
 *          modes above, or if an existing dataset is not rank 1.
 */
DataFrameTyped toHDF5(DataFrameTyped frame, string filename, string datasetName, DumpMode mode=DumpMode.append,
	bool extensible=true)
{
	import std.file:exists;
	hid_t file;
	if (filename.exists)
		file=H5F.open(filename,H5F_ACC_RDWR, H5P_DEFAULT);
	else
		file = friendlyH5Create(filename,100*1024*1024,true);
	// Every handle opened below is released on all exit paths; scope(exit)
	// runs in reverse order, so the file is closed last.
	scope(exit) H5F.close(file);

	hsize_t[1] chunk_dims =[CHUNKSIZE];
	auto dataType = frame.createDataType;
	scope(exit) H5T.close(dataType);
	// One zeroed element, used as the fill value for newly created datasets.
	ubyte[] junk;
	junk.length=H5T.get_size(dataType);
	writefln("%s data set length", junk.length);
	auto data=frame.toBytes;
	writefln("%s data set bytes", data.length);
	hsize_t[]  dim = [frame.numRows];
	if ((H5L.exists(file,datasetName,H5P_DEFAULT))) // does file contain our dataset
	{
		if ((mode==DumpMode.append) || (mode==DumpMode.truncate))
		{
			// NOTE: this assumes the existing dataset is extensible; that is not checked.
			auto dataset = H5D.open2(file, datasetName, H5P_DEFAULT);
			scope(exit) H5D.close(dataset);
			hsize_t[1]     dims_out,   offset;
			{
				auto dataspace = H5D.get_space(dataset);    /* dataspace handle */
				scope(exit) H5S.close(dataspace);
				auto rank      = H5S.get_simple_extent_ndims(dataspace);
				// dims_out holds exactly one extent; refuse anything else
				// rather than let the query write past it.
				enforce(rank==1,
					new Exception("toHDF5 can only append to rank 1 datasets and rank="~to!string(rank)));
				H5S.get_simple_extent_dims(dataspace, dims_out);
			}
			if (mode==DumpMode.append)
			{
				dim=[dims_out[0]+frame.numRows];
				offset[0] = dims_out[0];
			}
			else
			{
				dim=[frame.numRows];
				offset[0]=0;
			}
			H5D.set_extent(dataset, dim);
			// Re-fetch the file dataspace: it must reflect the new extent.
			auto filespace = H5D.get_space(dataset);
			scope(exit) H5S.close(filespace);
			auto dim2=[frame.numRows];
			H5S.select_hyperslab(filespace, H5SSeloper.Set, offset, dim2);
			auto dataspace2 = H5S.create_simple(dim2);
			scope(exit) H5S.close(dataspace2);
			H5D.write(dataset, dataType, dataspace2, filespace, H5P_DEFAULT, cast(ubyte*)data.ptr);
			return frame;
		}
		// need to destroy dataset but keep others in this file
		enforce(mode==DumpMode.unlink);
		H5L.h5delete(file,datasetName,H5P_DEFAULT);
	}

	hsize_t[1] maxdims = extensible?[H5S_UNLIMITED]:[frame.numRows];
	// A chunk may not be larger than a fixed-size dimension, so shrink it for
	// small non-extensible datasets.
	if (!extensible && frame.numRows>0 && frame.numRows<CHUNKSIZE)
		chunk_dims[0]=frame.numRows;

	auto cparms = H5P.create(H5P_DATASET_CREATE); // Modify dataset creation properties, i.e. enable chunking.
	scope(exit) H5Pclose(cparms);
	H5P.set_chunk( cparms, chunk_dims);
	auto dataspace = H5S.create_simple(dim, maxdims);
	scope(exit) H5S.close(dataspace);
	debug writefln("* h5s simple created"); stdout.flush;
	// Pass the buffer contents (junk.ptr), not the address of the slice itself.
	H5P.set_fill_value (cparms, dataType, cast(void*)junk.ptr);
	debug writefln("* creating dataset");
	auto dataset = H5D.create2(file, datasetName, dataType, dataspace, H5P_DEFAULT, cparms, H5P_DEFAULT);
	scope(exit) H5D.close(dataset);
	debug writefln("* dataset created");
	auto filespace = H5D.get_space(dataset);
	scope(exit) H5S.close(filespace);
	debug writefln("* writing data");
	H5D.write(dataset, dataType, dataspace,filespace, H5P_DEFAULT, cast(ubyte*)data.ptr);
	debug writefln("* finished writing data");
	return frame;
}
397 
398 
399 
400 
/**
 * Read a two-dimensional float dataset and turn it into a frame of Double columns.
 *
 * The dataset `ticker` inside group `groupName` is read as native floats and
 * handed to dataFrameTypedFromFloats; the first dimension is used as the row
 * count and the second as the column count.
 *
 * Params:
 *   filename     = HDF5 file; must exist
 *   groupName    = group containing the dataset
 *   ticker       = dataset name within the group
 *   columnTitles = one title per dataset column
 *
 * Throws: Exception if the file does not exist, the dataset is not rank 2, or
 *         its second dimension differs from `columnTitles.length`.
 */
auto dataFrameTypedFromSimpleHDF5Array(string filename, string groupName, string ticker, string[] columnTitles)
{
	import std.stdio:writef,writefln;
	import std.file:exists;
	hsize_t[2]     dims;
	float[] data;
	H5open();
	H5_init_library();
	enforce(exists(filename),new Exception(filename~" does not exist!"));
	// Handles are released on every exit path, in reverse order of opening.
	auto file=H5F.open(filename,H5F_ACC_RDWR, H5P_DEFAULT);
	scope(exit) H5F.close(file);
	auto groupID = H5G.open2(file, groupName, H5P_DEFAULT);
	scope(exit) H5G.close(groupID);
	writefln("GT=%s/%s",groupName,ticker);
	stdout.flush;
	auto dataset = H5D.open2(groupID, ticker, H5P_DEFAULT);
	scope(exit) H5D.close(dataset);
	auto dataspace = H5D.get_space(dataset);    /* dataspace handle */
	scope(exit) H5S.close(dataspace);
	auto rank      = H5S.get_simple_extent_ndims(dataspace);
	// dims has room for exactly two extents; check before it is filled.
	enforce(rank==2,
		new Exception("expected a rank 2 dataset for "~groupName~"/"~ticker~" and rank="~to!string(rank)));
	H5S.get_simple_extent_dims(dataspace, dims);
	enforce(dims[1]==columnTitles.length);
	writefln("dims = %s,rows= %s, columnTitles = %s",dims[1],dims[0],columnTitles.length);
	data.length=dims[0]*dims[1];
	H5D.read(dataset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, cast(ubyte*)data.ptr);
	return data.dataFrameTypedFromFloats(columnTitles);
}